Setting up the environment

# Packages
library(ggplot2)
library(dplyr)
library(plotly)
library(tidyr)
library(RColorBrewer)
library(limma)
library(gplots)
library(edgeR)
library(devtools)
library(ggbiplot)
library(DESeq)
library(geneplotter)
library(Rtsne)
set.seed(1234) # makes everything reproductible 
meta_data <- read.table("../Data/raw_data/meta_data.txt", header = TRUE)
dataInitial <- read.table("../Data/raw_data/expn_matrix.txt", header = TRUE, row.names = 1)
spike_data <- read.table("../Data/raw_data/expn_matrix_spike.txt", header = TRUE, row.names = 1)

Normalization

Kevin’s normalization

Here the data is normalized based on the spike_data we have. Although this seems to be correct, there could be some issues: See post.

spike_data <- rbind(spike_data, "Totals" = colSums(spike_data)) #finding sum of column and adding it as new row
scaled_totals <- spike_data["Totals", ]
average_reads <- rowMeans(scaled_totals)
scaled_totals <- rbind(scaled_totals, "Scale" = scaled_totals[1, ] / average_reads) #dividing total spike values by average spike values
dim_names <- list(row.names(dataInitial))       #setting dim_names for upcoming matrix
dim_names[2] <- list(colnames((dataInitial)))
normalized_data <- matrix(nrow = nrow(dataInitial), ncol = ncol(dataInitial), dimnames = dim_names) #pre-making a matrix
for(i in 1: ncol(dataInitial)){   #for loop which scales data and saves it as normalized_data
  normalized_data[ , i] <- dataInitial[ , i] / scaled_totals[2, i]
}
normalized_data <- as.data.frame(normalized_data)
#normalizing for cell size (cellular miRNA counts)
total_data <- rbind(spike_data, "Total Spike" = colSums(spike_data), "Total Reads" = colSums(dataInitial))
total_cellular <- total_data["Total Reads", ]
average_reads <- rowMeans(total_cellular)
cell_scales <- rbind(total_cellular, "Scale" = total_cellular[1, ] / average_reads) #dividing total spike values by average spike values
dim_names <- list(row.names(dataInitial))       #setting dim_names for upcoming matrix
dim_names[2] <- list(colnames((dataInitial)))
normalized_data2 <- matrix(nrow = nrow(dataInitial), ncol = ncol(dataInitial), dimnames = dim_names) #pre-making a matrix
for(i in 1: ncol(dataInitial)){   #for loop which scales data and saves it as normalized_data
  normalized_data2[ , i] <- normalized_data[ , i] / scaled_totals[2, i]
}
normalized_data2 <- as.data.frame(normalized_data2)
# deleting dead cells
alive_indexes <- list() #initializing lists
k <- 1  #initializing counter
for(i in 1:ncol(total_data)){
  if(total_data["Total Reads", i] > 20000){
    alive_indexes[k] <- i
    k <- k + 1
  }
}
alive_data <- normalized_data2[ , as.numeric(alive_indexes)]
alive_meta_data <- meta_data[as.numeric(alive_indexes), ]
#Saving completely normalized data as n_data and n_meta_data
n_data <- alive_data
n_meta_data <- alive_meta_data
#makes group and design matrix
data_day <- as.character(n_meta_data$Population)
data_day <- recode(data_day, "HL60D0" = "Day 0", "HL60D1" = "Day 1", "HL60D3" = "Day 3", "HL60D7" = "Day 7")
data_day <-factor(data_day)
design <- model.matrix(~0 + n_meta_data$Population)
#deletes "n_meta_data$Population" from name of columns
colnames(design) <- gsub("n_meta_data\\$Population", "", colnames(design))
#lets call intercept HL60D0
#colnames(design)[1] <- "HL60D0"
# Keeps genes without all zeros miRNA
dataNZ <- n_data[which(rowSums(n_data) > 0),] 
#making the contrast matrix
# let's suppose for now that we want to compare each group with each other ?????????????????????????
contrastMatrix <- makeContrasts(HL60D1-HL60D0,
                                HL60D3-HL60D0,
                                HL60D7-HL60D0,
                                HL60D1-HL60D3,
                                HL60D1-HL60D7,
                                HL60D3-HL60D7,
                                levels = c("HL60D0","HL60D1","HL60D3","HL60D7"))
#will not be needed: don't keep big data
remove(normalized_data)
remove(normalized_data2)
remove(alive_data)

Please note that with this normalization, we are only using cells ????????????????(alive cells?)

General normalization

To be sure that the normalization is good I will continue the analysis with a more general normalization. The following is based on this page. By general I mean not specific to pikes.

data_dayDESeq <- as.character(meta_data$Population)
data_dayDESeq <- recode(data_dayDESeq, "HL60D0" = "Day 0", "HL60D1" = "Day 1", "HL60D3" = "Day 3", "HL60D7" = "Day 7")
data_dayDESeq <-factor(data_dayDESeq)
deSeqDat <- newCountDataSet(dataInitial, data_dayDESeq)
# Note: actually it's not a real normalisation. Rather computing size factors depending on ratio of medians 
# if all size factors are roughly equal to one, the libraries have been sequenced equally deeply.
deSeqDat <- estimateSizeFactors(deSeqDat)
head(sizeFactors(deSeqDat))
HL60D0.C01_S1 HL60D0.C02_S2 HL60D0.C03_S3 HL60D0.C04_S4 HL60D0.C05_S5 HL60D0.C06_S6 
    0.5254052     1.1318130     0.9646220     0.3233751     1.3896389     0.9596427 
idx.nz <- apply(counts(deSeqDat), 1, function(x) { all(x > 0)})
nNZsamples <- sum(idx.nz)
#will not be needed: don't keep big data
remove(dataInitial)

We see that the number of non zero samples in all genes is very low compared to traditional RNA-seq: 5 nomrally in thousands. This may be a good reason to stick with the normalization with skie RNA.

#plotting the estimated dispersions against the mean normalized counts
deSeqDat <- estimateDispersions(deSeqDat)
plotDispEsts(deSeqDat)

multidensity( counts(deSeqDat, normalized = T),xlab="mean counts", xlim=c(0, 1000))

 multiecdf( counts(deSeqDat, normalized = T),xlab="mean counts", xlim=c(0, 1000))

The two charts above clearly shows that the second normalisation isn’t convincing. Indeed, the second chart assesses whether the normalization has worked, and the densities should overlapp since most of the genes are heavily affected by the experimental conditions. Note: The strange density chart could also be due to the fact that miRNA are very rarely expressed in every sample (as we have seen before).

Plots

PCA

PCA no standardization

First we use a log transformation to make the data approximatively follow the homoscedasticity assumption.

#Log transform
log_normalized_data <- log(dataNZ)
log_normalized_data[log_normalized_data == "-Inf"] <- 0
data_pca <- prcomp(t(log_normalized_data))
g <- ggbiplot(data_pca, 
              scale = 1, 
              obs.scale = 1, 
              varname.abbrev = FALSE,
              var.axes = FALSE,
              pc.biplot =TRUE,
              circle = TRUE, 
              groups = data_day, 
              ellipse= TRUE) +
  ggtitle("PCA without standardizing") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 
g 

#will not be needed: don't keep big data
#remove(g)
remove(data_pca)
remove(log_normalized_data)

Note: The PCA graph is interesting as it shows that after each day the clusters seem to go down in PC2 and extend their variance in PC1.

PCA standardized

Knowing the importance of having standardized data for PCA, we could think of standarding ours. This in general is a good idea, as veraibles are often not on the same scale and thus cannot be compared directly. Here, on revanche we have dimensions that are comparable as they each represent the number of miRNA in each cell. It is therefore a good idea not to standrdize, as it will unnecessarily decrease the variance explained. More information on this can be found on the following forums: (here)[http://stats.stackexchange.com/questions/105592/not-normalizing-data-before-pca-gives-better-explained-variance-ratio] or (here)[http://stats.stackexchange.com/questions/69157/why-do-we-need-to-normalize-data-before-analysis]. Out of curiosity we will still plot the standardized data:

#Log transform
log_normalized_dataNZ <- log(dataNZ)
log_normalized_dataNZ[log_normalized_dataNZ == "-Inf"] <- 0
data_pcaNZ <- prcomp(t(log_normalized_dataNZ), center = TRUE, scale. = TRUE)
g <- ggbiplot(data_pcaNZ, 
              scale = 1, 
              obs.scale = 1, 
              varname.abbrev = FALSE,
              var.axes = FALSE,
              pc.biplot =TRUE,
              circle = TRUE, 
              groups = data_day, 
              ellipse= TRUE) +
  ggtitle("PCA non zero genes") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 
g 

#will not be needed: don't keep big data
#remove(g)
remove(data_pcaNZ)
remove(log_normalized_dataNZ)

From this plot we see that as we’ve supposed, the variance explained is lower than in the first one. We thus conclude that we should stick with the first plot as we can (because on the same scale).

PCA top 50

Let’s now plot the top 50 genes

#arranging data based on total expression
indexTopFifty <- sort(rowSums(n_data), index=T, decreasing=TRUE)$ix[1:50]
topFifty <- n_data[indexTopFifty,]
# note: small difference with kenny probably come from the fact that he did log before and then took top fifty 
#Log transform
log_topFifty <- log(topFifty)
log_topFifty[log_topFifty == "-Inf"] <- 0
#PCA
data_topFifty <- prcomp(t(log_topFifty))
g <- ggbiplot(data_topFifty, 
              scale = 1, 
              obs.scale = 1, 
              varname.abbrev = FALSE,
              var.axes = FALSE,
              pc.biplot =TRUE,
              circle = TRUE, 
              groups = data_day, 
              ellipse= TRUE) +
  ggtitle("PCA top 50 genes") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 
g

#will not be needed: don't keep big data
#remove(g)
remove(data_topFifty)
remove(log_topFifty)
remove(topFifty)

Correlation heatmap

# Note: both plots of dataNZ and full data gives same correlation heatmap
normalized_data_cor <- cor(dataNZ)  #calculating sample-sample correlations
scaled_normalized_data_cor <- scale(normalized_data_cor)  #scaling for better contrast
limit <- 2
scaled_normalized_data_cor[which(scaled_normalized_data_cor > limit)] <- limit    #trimming for better contrast
scaled_normalized_data_cor[which(scaled_normalized_data_cor < -limit)] <- -limit
mypal <- rev(colorRampPalette(brewer.pal(11,"RdBu"))(100)) #setting colour palette
alphabet <- c("#b2df8a","#33a02c","#ff7f00","#fdbf6f","#b15928","#1f78b4","#6a3d9a","#e31a1c","#a6cee3","#cab2d6","#ffff99","#fb9a99")
pops <- alphabet[as.factor(data_day)] #setting population colour palette
heatmap.2(scaled_normalized_data_cor, Rowv = NA, Colv = NA, symm = T, #making a heatmap
          trace = "none", dendrogram = "none", col = mypal,
          labCol = NA, labRow = NA,
          cexCol = 0.45, cexRow = 0.45, main = "Sample-Sample Correlations",
          ColSideColors = pops) 

#will not be needed: don't keep big data
remove(normalized_data_cor)
remove(scaled_normalized_data_cor)
#remove(heatmapPlot)

In this heat map we clearly see the high correlation between the samples of day 0 and 1 and between the samples of day 3 and 7.

t-SNE plots

General t-SNE

Let’s plot the t-SNE plots of the cells that aren’t dead to see if we can find clusters and / or outliers. Note that we aren’t taking into account genes that aren’t expressed in any samples.

# t-SNE plot with alive cells and non zero genes
#ordering data by expn
alive_tsne_data <- cbind(dataNZ, "avg_expn" = rowMeans(dataNZ), "miRNA" = row.names(dataNZ)) 
alive_tsne_data <- dplyr::arrange(alive_tsne_data,desc(avg_expn))
alive_tsne_data <- t(alive_tsne_data)
colnames(alive_tsne_data) <- alive_tsne_data["miRNA", ]
#removes rows that were added for ordering
alive_tsne_data <- alive_tsne_data[!rownames(alive_tsne_data) %in% c("miRNA","avg_expn"), ]
tsne_out <- Rtsne(dist(alive_tsne_data)) # Run TSNE
# Show the objects in the 2D tsne representation
tSNETotal <- qplot(x=tsne_out$Y[,1], y=tsne_out$Y[,2],color=data_day) + 
  labs(colour = "Cell Type", x = "tsne1", y = "tsne2")+
  ggtitle("t-SNE alive cells and non zero genes") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 
tSNETotal

#will not be needed: don't keep big data
remove(tsne_out)
#remove(tSNETotal)

As with the correlation heatmap, we see that the samples from day 0 and 1 are clustered together while those from day 3 and 7 are clustered together.

t-SNE of Day 0 and Day 1

Now that we have seen that the samples from day 0 and 1 are closely related. Let’s try to take only those 2 in order to see if we can separate them from each other.

#tsne on Day 0/Day 1 cells only
indexDay01 <- which(data_day %in% c("Day 0","Day 1"))
group_one_data <- alive_tsne_data[indexDay01, ] #selecting only day 0 and day 1 cells
tsne_out <- Rtsne(dist(group_one_data)) # Run TSNE
# Show the objects in the 2D tsne representation
tsneDay01 <- qplot(x=tsne_out$Y[,1], y=tsne_out$Y[,2],color=data_day[indexDay01]) +
  labs(colour = "Cell Type", x = "tsne1", y = "tsne2") +
  ggtitle("t-SNE only day 0 and 1") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 
tsneDay01

#will not be needed: don't keep big data
remove(tsne_out)
#remove(tsneDay01)

The t-SNE can clearly not separate the 2 samples from each other. Finally let’s investigate whether they can be separated with a t-SNE in 3 dimension:

tsne_out <- Rtsne(dist(group_one_data), dims = 3) # Run TSNE in 3 dimensions
plot_ly(x=tsne_out$Y[,1],
        y=tsne_out$Y[,2],
        z=tsne_out$Y[,3],
        type="scatter3d",
        color=data_day[indexDay01],
        mode="markers") %>% 
  layout(title = 't-SNE day 0 and 1 3D')

#will not be needed: don't keep big data
remove(tsne_out)
remove(group_one_data)
#remove(tsneDay01)

We can still not distinguish both type of cells. This really shows how similar they are.

t-SNE of Day 0 and Day 1

Now let’s look at the cluster Day 3 and 7, to see if they can be distinguished.

#tsne on Day 0/Day 1 cells only
indexDay37 <- which(data_day %in% c("Day 3","Day 7"))
group_two_data <- alive_tsne_data[indexDay37, ] #selecting only day 0 and day 1 cells
tsne_out <- Rtsne(dist(group_two_data)) # Run TSNE
# Show the objects in the 2D tsne representation
tsneDay37 <- qplot(x=tsne_out$Y[,1], y=tsne_out$Y[,2],color=data_day[indexDay37]) +
  labs(colour = "Cell Type", x = "tsne1", y = "tsne2") +
  ggtitle("t-SNE only day 3 and 7") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 
tsneDay37

#will not be needed: don't keep big data
remove(tsne_out)
remove(alive_tsne_data)
#remove(tsneDay37)

As before, the 2 type of cells cannot be distinguished with a t-SNE plot in 2D. They will probably not be distinguishable in 3D but let’s be sure:

tsne_out <- Rtsne(dist(group_two_data), dims = 3) # Run TSNE in 3 dimensions
plot_ly(x=tsne_out$Y[,1],
        y=tsne_out$Y[,2],
        z=tsne_out$Y[,3],
        type="scatter3d",
        color=data_day[indexDay37],
        mode="markers") %>% 
  layout(title = 't-SNE day 3 and 7 3D')

#will not be needed: don't keep big data
remove(tsne_out)
remove(group_two_data)

Not suprisingly, the second cluster cannot be seprated further with 3 dimensional t-SNE.

Fit linear model

Note that for fitting linear models, we cannot use the DATASEQ library as it requires counts (i.e. discrete values). We now have continuous values because of they way we normalized the data.

Voom

Mean-Variance trend

Initial Mean-Variance tren

First let us visualize the mean-variance trend, to see.

#working ... HOW TO USE DATADESEQ ON NORMALIZED => NOT COUNTS ANY MORE?? + IS IT WORTH IT ? cAN we round ?
#dataDESeq <- rlog(as.matrix(data), blind=FALSE )
dge <- DGEList(counts=dataNZ, group = data_day)
# applies TMM normalization to dge
dge <- calcNormFactors(dge)
data_voomed <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)

The mean-variance trend is suprising: it doesn’t look flat enough. The first hyporthesis we had is that we there was a problem because we normalize by samples and then limma does it again. But not normailizing our data doesn’t change the curve of the plot. We then thought that maybe limma was doing something that required discrete data (as it is made for count data), our data isn’t discrete as we normailized it based on spike RNA. But rounding the data doesn’t change anything either. We will have to look deeper at the meaning of te curve of the plot and whether our data doesn’t follow the Voom assumptions. Note that the data may also be strange due to the nature of miRNA data, indeed in RNAseq we often have high value of counts but in miRNA the number of counts are much lower. This could cause the spike we see below x = zero (i.e. very low mean).

Lets check whether the strange mean variance is due to the clusters that we had.

df <- data_voomed$E
dataf<- mutate(data.frame(t(df)), group = data_day) %>% 
  gather(mirna, exp, - group) 
ggplot(dataf, aes(x=exp)) + 
  geom_histogram(aes(color = group)) +
  ggtitle("Initial istogram of counts") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

We see that our model seems to be bimodal but that the bimodality doesn’t seem to be due to the clusters.

Final Mean-Variance

We realized that is important that we use only the genes with no non zero values for every sample as we can see in paragraph 15.3 of (limma manual)[https://www.bioconductor.org/packages/devel/bioc/vignettes/limma/inst/doc/usersguide.pdf]. From the same article they say that as a rule of thumb is to retain rows that have at least 10 counts for a worthwhile number of samples.

dataNZFilterd <-dataNZ[rowSums(dataNZ>=10)>=round(0.3*ncol(dataNZ)),]
dge <- DGEList(counts=dataNZFilterd, group = data_day)
# applies TMM normalization to dge
#dge <- dge[isExpr,]
dge <- calcNormFactors(dge)
data_voomed <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)

It is now already a bit better. We also see that the mean variance trend seam to be decreasing very quickly and steadily, we can thus conclude that the experiment seem to be of low biological variation. For more information please see the “Removing heteroscedascity from count data” subpart of (this paper)[https://f1000research.com/articles/5-1408]

Here is an example of the data we have:

data_voomed[1:2,1:10]
An object of class "EList"
$targets
               group  lib.size norm.factors
HL60D0.C01_S1  Day 0  98643.51    0.9722808
HL60D0.C02_S2  Day 0  71913.77    0.9919387
HL60D0.C03_S3  Day 0  60085.97    1.0149911
HL60D0.C04_S4  Day 0 269051.46    0.9274848
HL60D0.C05_S5  Day 0  61654.79    1.0837952
HL60D0.C06_S6  Day 0  43943.05    1.0092054
HL60D0.C07_S7  Day 0 132684.23    0.9131704
HL60D0.C08_S8  Day 0  59127.47    1.1386215
HL60D0.C09_S10 Day 0 218444.65    1.0948769
HL60D0.C10_S11 Day 0  57710.01    1.2096569

$E
              HL60D0.C01_S1 HL60D0.C02_S2 HL60D0.C03_S3 HL60D0.C04_S4 HL60D0.C05_S5
hsa-let-7a-5p     13.406560     12.814684      12.92698     7.7084698      13.15058
hsa-let-7a-3p      2.341617      2.797568      10.12266     0.8940406      10.22792
              HL60D0.C06_S6 HL60D0.C07_S7 HL60D0.C08_S8 HL60D0.C09_S10 HL60D0.C10_S11
hsa-let-7a-5p      13.23362      13.04953     13.541487      12.720075      13.252301
hsa-let-7a-3p      10.25054       1.91392      9.819029       9.354078       9.812598

$weights
          [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]      [,8]
[1,] 0.9303631 0.7013970 0.6018274 2.6953635 0.6149923 0.4656913 1.2368403 0.5937921
[2,] 0.1360968 0.1201292 0.1128750 0.2259304 0.1137440 0.1032960 0.1560068 0.1123739
          [,9]     [,10]
[1,] 2.1068200 0.5819193
[2,] 0.2012992 0.1116230

$design
   HL60D0 HL60D1 HL60D3 HL60D7
1       1      0      0      0
2       1      0      0      0
3       1      0      0      0
4       1      0      0      0
5       1      0      0      0
6       1      0      0      0
7       1      0      0      0
8       1      0      0      0
9       1      0      0      0
10      1      0      0      0

Final mean-variance bis

Thinking about it, we shouldn’t filter on having as good percentage of the samples that are non zero but rather having a majority of non zero in at least one group we have. Indeed that’s what really interests us at the end, because we are going to compare the groups!

indexDay0 <- which(data_day == "Day 0")
indexDay1 <- which(data_day == "Day 1")
indexDay3 <- which(data_day == "Day 3")
indexDay7 <- which(data_day == "Day 7")
dataNZFilterdBis <- dataNZ[rowSums(dataNZ[,indexDay0]>=10)>=round(0.50*length(indexDay0)) |
                          rowSums(dataNZ[,indexDay1]>=10)>=round(0.50*length(indexDay1)) |
                          rowSums(dataNZ[,indexDay3]>=10)>=round(0.50*length(indexDay3)) |
                          rowSums(dataNZ[,indexDay7]>=10)>=round(0.50*length(indexDay7)) ,]
dge <- DGEList(counts=dataNZFilterdBis, group = data_day)
# applies TMM normalization to dge
#dge <- dge[isExpr,]
dge <- calcNormFactors(dge)
data_voomed <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)

We see that it is very close to before but at least now there’s no subjectivity of “a worthile number of samples need to have more than 10 counts for that miRNA”.

Lets look now at the histogram to see how it changed it:

df <- data_voomed$E
dataf<- mutate(data.frame(t(df)), group = data_day) %>% 
  gather(mirna, exp, - group) 
ggplot(dataf, aes(x=exp)) + 
  geom_histogram(aes(color = group)) +
  ggtitle("Final histogram of counts") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

Why strange shape?: I just realized the reason of the strange shape: everything is due to the extreme variance at low count ! Indeed generally the mean variance plot shows a \(\sqrt(SD)\) that is around 1, but ours is around 2, but only at a low count. The reason is simple: the clusters! We can now explain all the curve: in the unfiltered mean-variance plot there was a strange ascent at the bigining, this is due to the fact that miRNA have a relatively low number of counts and is either on or off so we had a lot of zeros, these are now filtered out. Then there is a very big variance at low to average number of counts, this is due to the fact that the clusters are very strong in our samples, the miRNA are either on or off depending on the clusters (cluster 1: day 0 and 1, cluster 2: day 3 and 7) so when looking at possibly interesting miRNA they are often on in one of the cluster and off in the other this thus results in a relatively low number of counts but a very big variance (bimodal data). Fow high counts we see that the variance is really low, this is simply showing “not interesting” miRNA that are always very high and thus have a low variance!

One good way to go around that is to look differently at the genes that are always high and those that are only high in one of the clusters (when they are low in both they were already filtered out). Lets look at the plots to see if this hypothesis is true

Always high mirRNA
rowsAlwaysHigh <- rowSums(dataNZFilterdBis>=12.5)>=round(0.80*ncol(dataNZFilterdBis))
dataNZFilterAlwaysHigh <- dataNZFilterdBis[rowsAlwaysHigh,]
dge <- DGEList(counts=dataNZFilterAlwaysHigh, group = data_day)
# applies TMM normalization to dge
#dge <- dge[isExpr,]
dge <- calcNormFactors(dge)
data_voomedAH <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)

If the hypothesis is true then the histogram should be now normally distributed (no more binomial because the genes we are looking at are always highly expressed)

df <- data_voomedAH$E
dataf<- mutate(data.frame(t(df)), group = data_day) %>% 
  gather(mirna, exp, - group) 
ggplot(dataf, aes(x=exp)) + 
  geom_histogram(aes(color = group)) +
  ggtitle("Initial istogram of counts") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

miRNA high in a single group
dataNZFilterHighSingleGroup <- dataNZFilterdBis[!rowsAlwaysHigh,]
dge <- DGEList(counts=dataNZFilterHighSingleGroup, group = data_day)
# applies TMM normalization to dge
#dge <- dge[isExpr,]
dge <- calcNormFactors(dge)
data_voomedHSG <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)

Now here there should still be a bimodal distribution as when one cluster is high the other one will be low and vis versa.

df <- data_voomedHSG$E
dataf<- mutate(data.frame(t(df)), group = data_day) %>% 
  gather(mirna, exp, - group) 
ggplot(dataf, aes(x=exp)) + 
  geom_histogram(aes(color = group)) +
  ggtitle("Initial istogram of counts") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

we see that as hypthosized the histogramm is bimodal

Mean variance trend no log

NOTE: this is not important any more ! Note: Seeing the strange plot above makes us think that maybe our count data doesn’t need to be log transformed. Let’s investigate the relationship between the mean and the variance of ou data.

#working ... HOW TO USE DATADESEQ ON NORMALIZED => NOT COUNTS ANY MORE?? + IS IT WORTH IT ? cAN we round ?
#dataDESeq <- rlog(as.matrix(data), blind=FALSE )
#takes only genes that are non zero in many samples
#t <- n_data[which(rowSums(n_data) > 300),] 
#tries plotting directly without voom
m <- rowMeans(dataNZ)
RowSigmaSqrt <- function(x) {
  (rowSums((x - rowMeans(x))^2)/(dim(x)[2] - 1))^(1/4)
}
v <- RowSigmaSqrt(dataNZ)
plotxy <- data.frame(mean=m,sigmaSqrt=v)
ggplot(plotxy, aes(x=mean, y=sigmaSqrt)) +
    geom_point(shape=1) +    # Use hollow circles
    geom_smooth(method=lm,   # Add linear regression line
                se=FALSE) + # Don't add shaded confidence region
  ylim(0, 7500) + 
  xlim(0,7500) + 
  ggtitle("Mean variance relationship not Voom") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

From this plot we clearly see that our variance grows with fourth poqer of our mean. The reasoning behind this different relationship is still unclear to us (????????????? biologically), but it could indicate that we shouldn’t work with the default limma package and that we should maybe do the analysis manually (????????).

Fit linear model

fit <- lmFit(data_voomed, design)
fit <- contrasts.fit(fit, contrasts=contrastMatrix)
efit <- eBayes(fit)
topTable(efit, coef=colnames(coef(efit)))
plotSA(efit)

#will not be needed: don't keep big data
remove(fit)
remove(data_voomed)

Here there is a problem as we do not see the fitted line (?????).

Examining the number of DE genes

Let’s now look at differential expression levels of genes. Note that the adjusted p-value cutoff is 5%.

dt <- decideTests(efit)
summary(dt)
   HL60D1 - HL60D0 HL60D3 - HL60D0 HL60D7 - HL60D0 HL60D1 - HL60D3 HL60D1 - HL60D7
-1              20              13              70              58              33
0               96              76              30              59              29
1               10              37              26               9              64
   HL60D3 - HL60D7
-1              27
0               15
1               84

As we have already previously seen in the multiple plots of the previous section: most differential expressed genes are between day 0/1 and 3/7. We also note that a lot of genes in Day 3 and 7 are downregulated (compared to day 0 and 1).

Finding genes that are often differentially expressed compared to day 0

Let’s look at the genes that are always differentially expressed compared to day 0

# Finding genes that are not zero in every comparaison with day 0
commonDEgenesDay0 <- which(dt[,1]!=0 & dt[,2]!=0 & dt[,3]!=0)
names(commonDEgenesDay0)
 [1] "hsa-let-7a-3p"   "hsa-miR-132-3p"  "hsa-miR-143-3p"  "hsa-miR-17-3p"  
 [5] "hsa-miR-181b-5p" "hsa-miR-181b-3p" "hsa-miR-199b-5p" "hsa-miR-19b-3p" 
 [9] "hsa-miR-221-3p"  "hsa-miR-223-3p"  "hsa-miR-28-3p"   "hsa-miR-30d-5p" 
[13] "hsa-miR-30e-5p"  "hsa-miR-378a-3p" "hsa-miR-99b-5p" 

Let’s visualise it with a nice venn diagramm

vennDiagram(dt[,1:3], circle.col=c("turquoise", "salmon", "green"))

Finding genes that are often differentially between day 0/1 and day 3/7

Let’s look at the genes that are always differentially between day 0/1 and 3/7 as they seem to be the ones of interest.

# Finding genes that are not zero in every comparaison with day 0
commonDEgenesDay0137 <- which(dt[,2]!=0 & dt[,3]!=0 & dt[,4]!=0 & dt[,5]!=0)
names(commonDEgenesDay0137)
 [1] "hsa-let-7d-3p"   "hsa-let-7i-5p"   "hsa-miR-125a-5p" "hsa-miR-132-3p" 
 [5] "hsa-miR-143-3p"  "hsa-miR-145-5p"  "hsa-miR-181b-3p" "hsa-miR-185-5p" 
 [9] "hsa-miR-192-5p"  "hsa-miR-19b-3p"  "hsa-miR-223-3p"  "hsa-miR-23a-3p" 
[13] "hsa-miR-26a-5p"  "hsa-miR-30e-5p"  "hsa-miR-425-5p"  "hsa-miR-425-3p" 
[17] "hsa-miR-92a-3p" 

Let’s visualise it with a nice venn diagramm

vennDiagram(dt[,2:5], circle.col=c("turquoise", "salmon", "green","yellow"))

Finding genes that are differentially expressed between clusters but not whithin clusters

Finally what we are really interested in are genes that are differentially expressed between clusters but not whithin clusters: here are these genes:

# Finding genes that are differentially expressed between clusters but not whithin clusters
commonDEgenesBetweenNotWhithin <- which(dt[,2]!=0 & dt[,3]!=0 & dt[,4]!=0 & dt[,5]!=0 & dt[,1]==0 & dt[,6]==0)
names(commonDEgenesBetweenNotWhithin)
[1] "hsa-let-7d-3p"  "hsa-miR-192-5p" "hsa-miR-92a-3p"
---
title: "Summary work"
author: "Kevin Jepson, Stephanie Blain, Mackenzie Kinney, Garrett McCarthy, Kenneth Askelson and Yann Dubois"
output:
  html_notebook:
    toc: yes
header-includes: \usepackage{amsmath}
---

# Setting up the environment

```{r loading_data, results='hide', message=FALSE, warning=FALSE}
# Packages
library(ggplot2)
library(dplyr)
library(plotly)
library(tidyr)
library(RColorBrewer)
library(limma)
library(gplots)
library(edgeR)
library(devtools)
library(ggbiplot)
library(DESeq)
library(geneplotter)
library(Rtsne)

set.seed(1234) # makes everything reproductible 

meta_data <- read.table("../Data/raw_data/meta_data.txt", header = TRUE)
dataInitial <- read.table("../Data/raw_data/expn_matrix.txt", header = TRUE, row.names = 1)
spike_data <- read.table("../Data/raw_data/expn_matrix_spike.txt", header = TRUE, row.names = 1)
```

#Normalization 

##Kevin's normalization

Here the data is normalized based on the spike_data we have. Although this seems to be correct, there could be some issues: [See post](https://support.bioconductor.org/p/74870/).

```{r normalize_kevin}
spike_data <- rbind(spike_data, "Totals" = colSums(spike_data)) #finding sum of column and adding it as new row
scaled_totals <- spike_data["Totals", ]
average_reads <- rowMeans(scaled_totals)
scaled_totals <- rbind(scaled_totals, "Scale" = scaled_totals[1, ] / average_reads) #dividing total spike values by average spike values

dim_names <- list(row.names(dataInitial))       #setting dim_names for upcoming matrix
dim_names[2] <- list(colnames((dataInitial)))

normalized_data <- matrix(nrow = nrow(dataInitial), ncol = ncol(dataInitial), dimnames = dim_names) #pre-making a matrix

for(i in 1: ncol(dataInitial)){   #for loop which scales data and saves it as normalized_data
  normalized_data[ , i] <- dataInitial[ , i] / scaled_totals[2, i]
}
normalized_data <- as.data.frame(normalized_data)

#normalizing for cell size (cellular miRNA counts)
total_data <- rbind(spike_data, "Total Spike" = colSums(spike_data), "Total Reads" = colSums(dataInitial))
total_cellular <- total_data["Total Reads", ]
average_reads <- rowMeans(total_cellular)
cell_scales <- rbind(total_cellular, "Scale" = total_cellular[1, ] / average_reads) #dividing total spike values by average spike values

dim_names <- list(row.names(dataInitial))       #setting dim_names for upcoming matrix
dim_names[2] <- list(colnames((dataInitial)))

normalized_data2 <- matrix(nrow = nrow(dataInitial), ncol = ncol(dataInitial), dimnames = dim_names) #pre-making a matrix

for(i in 1: ncol(dataInitial)){   #for loop which scales data and saves it as normalized_data
  normalized_data2[ , i] <- normalized_data[ , i] / scaled_totals[2, i]
}
normalized_data2 <- as.data.frame(normalized_data2)

# deleting dead cells

alive_indexes <- list() #initializing lists
k <- 1  #initializing counter
for(i in 1:ncol(total_data)){
  if(total_data["Total Reads", i] > 20000){
    alive_indexes[k] <- i
    k <- k + 1
  }
}

alive_data <- normalized_data2[ , as.numeric(alive_indexes)]
alive_meta_data <- meta_data[as.numeric(alive_indexes), ]

#Saving completely normalized data as n_data and n_meta_data
n_data <- alive_data
n_meta_data <- alive_meta_data

#makes group and design matrix
data_day <- as.character(n_meta_data$Population)
data_day <- recode(data_day, "HL60D0" = "Day 0", "HL60D1" = "Day 1", "HL60D3" = "Day 3", "HL60D7" = "Day 7")
data_day <-factor(data_day)

design <- model.matrix(~0 + n_meta_data$Population)
#deletes "n_meta_data$Population" from name of columns
colnames(design) <- gsub("n_meta_data\\$Population", "", colnames(design))
#lets call intercept HL60D0
#colnames(design)[1] <- "HL60D0"

# Keeps genes without all zeros miRNA
dataNZ <- n_data[which(rowSums(n_data) > 0),] 

#making the contrast matrix
# let's suppose for now that we want to compare each group with each other ?????????????????????????
contrastMatrix <- makeContrasts(HL60D1-HL60D0,
                                HL60D3-HL60D0,
                                HL60D7-HL60D0,
                                HL60D1-HL60D3,
                                HL60D1-HL60D7,
                                HL60D3-HL60D7,
                                levels = c("HL60D0","HL60D1","HL60D3","HL60D7"))

#will not be needed: don't keep big data
remove(normalized_data)
remove(normalized_data2)
remove(alive_data)
```

Please note that with this normalization, we are only using cells ????????????????(alive cells?)

##General normalization 

To be sure that the normalization is good I will continue the analysis with a more general normalization. The following is based on [this page](http://www-huber.embl.de/users/klaus/Teaching/DESeq2Predoc2014.html#pca-and-sample-heatmaps). By general I mean not specific to pikes.

```{r normalize_DESeq}
data_dayDESeq <- as.character(meta_data$Population)
data_dayDESeq <- recode(data_dayDESeq, "HL60D0" = "Day 0", "HL60D1" = "Day 1", "HL60D3" = "Day 3", "HL60D7" = "Day 7")
data_dayDESeq <-factor(data_dayDESeq)

deSeqDat <- newCountDataSet(dataInitial, data_dayDESeq)

# Note: actually it's not a real normalisation. Rather computing size factors depending on ratio of medians 
# if all size factors are roughly equal to one, the libraries have been sequenced equally deeply.

deSeqDat <- estimateSizeFactors(deSeqDat)
head(sizeFactors(deSeqDat))

idx.nz <- apply(counts(deSeqDat), 1, function(x) { all(x > 0)})
nNZsamples <- sum(idx.nz)

#will not be needed: don't keep big data
remove(dataInitial)
```

We see that the number of non zero samples in all genes is very low compared to traditional RNA-seq: `r nNZsamples` nomrally in thousands. This may be a good reason to stick with the normalization with skie RNA.

```{r plott_dispersion}
#plotting the estimated dispersions against the mean normalized counts
deSeqDat <- estimateDispersions(deSeqDat)
plotDispEsts(deSeqDat)
multidensity( counts(deSeqDat, normalized = T),xlab="mean counts", xlim=c(0, 1000))
```

```{r plott_density}
 multiecdf( counts(deSeqDat, normalized = T),xlab="mean counts", xlim=c(0, 1000))
```

The two charts above clearly shows that the second normalisation isn't convincing. Indeed, the second chart assesses whether the normalization has worked, and the densities should overlapp since most of the genes are heavily affected by the experimental conditions. Note: The strange density chart could also be due to the fact that miRNA are very rarely expressed in every sample (as we have seen before).


# Plots

## PCA

### PCA no standardization

First we use a log transformation to make the data approximatively follow the homoscedasticity assumption. 
```{r PCA}
#Log transform
log_normalized_data <- log(dataNZ)
log_normalized_data[log_normalized_data == "-Inf"] <- 0

data_pca <- prcomp(t(log_normalized_data))
g <- ggbiplot(data_pca, 
              scale = 1, 
              obs.scale = 1, 
              varname.abbrev = FALSE,
              var.axes = FALSE,
              pc.biplot =TRUE,
              circle = TRUE, 
              groups = data_day, 
              ellipse= TRUE) +
  ggtitle("PCA without standardizing") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

g 

#will not be needed: don't keep big data
#remove(g)
remove(data_pca)
remove(log_normalized_data)
```
Note: The PCA graph is interesting as it shows that after each day the clusters seem to go down in PC2 and extend their variance in PC1.

### PCA standardized

Knowing the importance of having standardized data for PCA, we could think of standarding ours. This in general is a good idea, as veraibles are often not on the same scale and thus cannot be compared directly. Here, on revanche we have dimensions that are comparable as they each represent the number of miRNA in each cell. It is therefore a good idea not to standrdize, as it will unnecessarily decrease the variance explained. More information on this can be found on the following forums: (here)[http://stats.stackexchange.com/questions/105592/not-normalizing-data-before-pca-gives-better-explained-variance-ratio] or (here)[http://stats.stackexchange.com/questions/69157/why-do-we-need-to-normalize-data-before-analysis]. Out of curiosity we will still plot the standardized data:

```{r PCA_NZ}
#Log transform
log_normalized_dataNZ <- log(dataNZ)
log_normalized_dataNZ[log_normalized_dataNZ == "-Inf"] <- 0

data_pcaNZ <- prcomp(t(log_normalized_dataNZ), center = TRUE, scale. = TRUE)
g <- ggbiplot(data_pcaNZ, 
              scale = 1, 
              obs.scale = 1, 
              varname.abbrev = FALSE,
              var.axes = FALSE,
              pc.biplot =TRUE,
              circle = TRUE, 
              groups = data_day, 
              ellipse= TRUE) +
  ggtitle("PCA non zero genes") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

g 

#will not be needed: don't keep big data
#remove(g)
remove(data_pcaNZ)
remove(log_normalized_dataNZ)
```

From this plot we see that as we've supposed, the variance explained is lower than in the first one. We thus conclude that we should stick with the first plot as we can (because on the same scale).

### PCA top 50 

Let's now plot the top 50 genes

```{r PCA_top50}
#arranging data based on total expression
indexTopFifty <- sort(rowSums(n_data), index=T, decreasing=TRUE)$ix[1:50]
topFifty <- n_data[indexTopFifty,]
# note: small difference with kenny probably come from the fact that he did log before and then took top fifty 

#Log transform
log_topFifty <- log(topFifty)
log_topFifty[log_topFifty == "-Inf"] <- 0

#PCA
data_topFifty <- prcomp(t(log_topFifty))
g <- ggbiplot(data_topFifty, 
              scale = 1, 
              obs.scale = 1, 
              varname.abbrev = FALSE,
              var.axes = FALSE,
              pc.biplot =TRUE,
              circle = TRUE, 
              groups = data_day, 
              ellipse= TRUE) +
  ggtitle("PCA top 50 genes") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

g

#will not be needed: don't keep big data
#remove(g)
remove(data_topFifty)
remove(log_topFifty)
remove(topFifty)
```

## Correlation heatmap
```{r correlation_heatmap}
# Note: both plots of dataNZ and full data gives same correlation heatmap
normalized_data_cor <- cor(dataNZ)  #calculating sample-sample correlations
scaled_normalized_data_cor <- scale(normalized_data_cor)  #scaling for better contrast

limit <- 2
scaled_normalized_data_cor[which(scaled_normalized_data_cor > limit)] <- limit    #trimming for better contrast
scaled_normalized_data_cor[which(scaled_normalized_data_cor < -limit)] <- -limit

mypal <- rev(colorRampPalette(brewer.pal(11,"RdBu"))(100)) #setting colour palette
alphabet <- c("#b2df8a","#33a02c","#ff7f00","#fdbf6f","#b15928","#1f78b4","#6a3d9a","#e31a1c","#a6cee3","#cab2d6","#ffff99","#fb9a99")
pops <- alphabet[as.factor(data_day)] #setting population colour palette

heatmap.2(scaled_normalized_data_cor, Rowv = NA, Colv = NA, symm = T, #making a heatmap
          trace = "none", dendrogram = "none", col = mypal,
          labCol = NA, labRow = NA,
          cexCol = 0.45, cexRow = 0.45, main = "Sample-Sample Correlations",
          ColSideColors = pops) 

#will not be needed: don't keep big data
remove(normalized_data_cor)
remove(scaled_normalized_data_cor)
#remove(heatmapPlot)
```

In this heat map we clearly see the high correlation between the samples of day 0 and 1 and between the samples of day 3 and 7.


## t-SNE plots

### General t-SNE

Let's plot the t-SNE plots of the cells that aren't dead to see if we can find clusters and / or outliers. Note that we aren't taking into account genes that aren't expressed in any samples.
```{r tsne_General}
# t-SNE plot with alive cells and non zero genes
#ordering data by expn
alive_tsne_data <- cbind(dataNZ, "avg_expn" = rowMeans(dataNZ), "miRNA" = row.names(dataNZ)) 
alive_tsne_data <- dplyr::arrange(alive_tsne_data,desc(avg_expn))
alive_tsne_data <- t(alive_tsne_data)
colnames(alive_tsne_data) <- alive_tsne_data["miRNA", ]
#removes rows that were added for ordering
alive_tsne_data <- alive_tsne_data[!rownames(alive_tsne_data) %in% c("miRNA","avg_expn"), ]

tsne_out <- Rtsne(dist(alive_tsne_data)) # Run TSNE
# Show the objects in the 2D tsne representation
tSNETotal <- qplot(x=tsne_out$Y[,1], y=tsne_out$Y[,2],color=data_day) + 
  labs(colour = "Cell Type", x = "tsne1", y = "tsne2")+
  ggtitle("t-SNE alive cells and non zero genes") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

tSNETotal

#will not be needed: don't keep big data
remove(tsne_out)
#remove(tSNETotal)
```

As with the correlation heatmap, we see that  the samples from day 0 and 1 are clustered together while those from day 3 and 7 are clustered together.

### t-SNE of Day 0 and Day 1

Now that we have seen that the samples from day 0 and 1 are closely related. Let's try to take only those 2 in order to see if we can separate them from each other. 

```{r tsne_day01}
#tsne on Day 0/Day 1 cells only
indexDay01 <- which(data_day %in% c("Day 0","Day 1"))
group_one_data <- alive_tsne_data[indexDay01, ] #selecting only day 0 and day 1 cells
tsne_out <- Rtsne(dist(group_one_data)) # Run TSNE
# Show the objects in the 2D tsne representation
tsneDay01 <- qplot(x=tsne_out$Y[,1], y=tsne_out$Y[,2],color=data_day[indexDay01]) +
  labs(colour = "Cell Type", x = "tsne1", y = "tsne2") +
  ggtitle("t-SNE only day 0 and 1") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

tsneDay01

#will not be needed: don't keep big data
remove(tsne_out)
#remove(tsneDay01)
```

The t-SNE can clearly not separate the 2 samples from each other. Finally let's investigate whether they can be separated with a t-SNE in 3 dimension:

```{r tsne_day01_3D}
tsne_out <- Rtsne(dist(group_one_data), dims = 3) # Run TSNE in 3 dimensions

plot_ly(x=tsne_out$Y[,1],
        y=tsne_out$Y[,2],
        z=tsne_out$Y[,3],
        type="scatter3d",
        color=data_day[indexDay01],
        mode="markers") %>% 
  layout(title = 't-SNE day 0 and 1 3D')

#will not be needed: don't keep big data
remove(tsne_out)
remove(group_one_data)
#remove(tsneDay01)
```

We can still not distinguish both type of cells. This really shows how similar they are.

### t-SNE of Day 0 and Day 1

Now let's look at the cluster Day 3 and 7, to see if they can be distinguished.

```{r tsne_day37}
#tsne on Day 0/Day 1 cells only
indexDay37 <- which(data_day %in% c("Day 3","Day 7"))
group_two_data <- alive_tsne_data[indexDay37, ] #selecting only day 0 and day 1 cells
tsne_out <- Rtsne(dist(group_two_data)) # Run TSNE
# Show the objects in the 2D tsne representation
tsneDay37 <- qplot(x=tsne_out$Y[,1], y=tsne_out$Y[,2],color=data_day[indexDay37]) +
  labs(colour = "Cell Type", x = "tsne1", y = "tsne2") +
  ggtitle("t-SNE only day 3 and 7") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

tsneDay37

#will not be needed: don't keep big data
remove(tsne_out)
remove(alive_tsne_data)
#remove(tsneDay37)
```

As before, the 2 type of cells cannot be distinguished with a t-SNE plot in 2D. They will probably not be distinguishable in 3D but let's be sure:

```{r tsne_day37_3D}
tsne_out <- Rtsne(dist(group_two_data), dims = 3) # Run TSNE in 3 dimensions

plot_ly(x=tsne_out$Y[,1],
        y=tsne_out$Y[,2],
        z=tsne_out$Y[,3],
        type="scatter3d",
        color=data_day[indexDay37],
        mode="markers") %>% 
  layout(title = 't-SNE day 3 and 7 3D')

#will not be needed: don't keep big data
remove(tsne_out)
remove(group_two_data)
```

Not suprisingly, the second cluster cannot be seprated further with 3 dimensional t-SNE.


# Fit linear model

Note that for fitting linear models, we cannot use the DATASEQ library as it requires counts (i.e. discrete values). We now have continuous values because of they way we normalized the data.

## Voom

### Mean-Variance trend

#### Initial Mean-Variance tren

First let us visualize the mean-variance trend, to see. 

```{r intial_mean_variance}
#working ... HOW TO USE DATADESEQ ON NORMALIZED => NOT COUNTS ANY MORE?? + IS IT WORTH IT ? cAN we round ?
#dataDESeq <- rlog(as.matrix(data), blind=FALSE )

dge <- DGEList(counts=dataNZ, group = data_day)
# applies TMM normalization to dge

dge <- calcNormFactors(dge)
data_voomed <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)
```

The mean-variance trend is suprising: it doesn't look flat enough. The first hyporthesis we had is that we there was a problem because we normalize by samples and then limma does it again. But not normailizing our data doesn't change the curve of the plot. We then thought that maybe limma was doing something that required discrete data (as it is made for count data), our data isn't discrete as we normailized it based on spike RNA. But rounding the data doesn't change anything either. We will have to look deeper at the meaning of te curve of the plot and whether our data doesn't follow the Voom assumptions. Note that the data may also be strange due to the nature of miRNA data, indeed in RNAseq we often have high value of counts but in miRNA the number of counts are much lower. This could cause the spike we see below x = zero (i.e. very low mean).

Lets check whether the strange mean variance is due to the clusters that we had.

```{r histogramm_voom}
df <- data_voomed$E
dataf<- mutate(data.frame(t(df)), group = data_day) %>% 
  gather(mirna, exp, - group) 

ggplot(dataf, aes(x=exp)) + 
  geom_histogram(aes(color = group)) +
  ggtitle("Initial istogram of counts") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

#will not be needed: don't keep big data
remove(dataf)
remove(df)
```

We see that our model seems to be bimodal but that the bimodality doesn't seem to be due to the clusters.

#### Final Mean-Variance
We realized that is important that we use only the genes with no non zero values for every sample as we can see in paragraph 15.3 of (limma manual)[https://www.bioconductor.org/packages/devel/bioc/vignettes/limma/inst/doc/usersguide.pdf]. From the same article they say that as a rule of thumb is to retain rows that have at least 10 counts for a worthwhile number of samples. 

```{r final_mean_variance}
dataNZFilterd <-dataNZ[rowSums(dataNZ>=10)>=round(0.3*ncol(dataNZ)),]

dge <- DGEList(counts=dataNZFilterd, group = data_day)
# applies TMM normalization to dge

#dge <- dge[isExpr,]
dge <- calcNormFactors(dge)
data_voomed <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)
remove(dataNZFilterd)
```

It is now already a bit better. We also see that the mean variance trend seam to be decreasing very quickly and steadily, we can thus conclude that the experiment seem to be of low biological variation. For more information please see the "Removing heteroscedascity from count data" subpart of (this paper)[https://f1000research.com/articles/5-1408]

Here is an example of the data we have:

```{r }
data_voomed[1:2,1:10]
```

#### Final mean-variance bis

Thinking about it, we shouldn't filter on having as good percentage of the samples that are non zero but rather having a majority of non zero in at least one group we have. Indeed that's what really interests us at the end, because we are going to compare the groups!

```{r final_mean_variance_bis}
indexDay0 <- which(data_day == "Day 0")
indexDay1 <- which(data_day == "Day 1")
indexDay3 <- which(data_day == "Day 3")
indexDay7 <- which(data_day == "Day 7")

dataNZFilterdBis <- dataNZ[rowSums(dataNZ[,indexDay0]>=10)>=round(0.50*length(indexDay0)) |
                          rowSums(dataNZ[,indexDay1]>=10)>=round(0.50*length(indexDay1)) |
                          rowSums(dataNZ[,indexDay3]>=10)>=round(0.50*length(indexDay3)) |
                          rowSums(dataNZ[,indexDay7]>=10)>=round(0.50*length(indexDay7)) ,]

dge <- DGEList(counts=dataNZFilterdBis, group = data_day)
# applies TMM normalization to dge
dge <- calcNormFactors(dge)
data_voomed <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)
```

We see that it is very close to before but at least now there's no subjectivity of "a worthile number of samples need to have more than 10 counts for that miRNA".

Lets look now at the histogram to see how it changed it:

```{r Final_histogramm_voom}
df <- data_voomed$E
dataf<- mutate(data.frame(t(df)), group = data_day) %>% 
  gather(mirna, exp, - group) 

ggplot(dataf, aes(x=exp)) + 
  geom_histogram(aes(color = group)) +
  ggtitle("Final histogram of counts") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

#will not be needed: don't keep big data
remove(dataf)
remove(df)
```

**Why strange shape?**:
I just realized the reason of the strange shape: everything is due to the extreme variance at low count ! Indeed generally the mean variance plot shows a $\sqrt(SD)$ that is around 1, but ours is around 2, but only at a low count. The reason is simple: the clusters! We can now explain all the curve: in the unfiltered mean-variance plot there was a strange ascent at the bigining, this is due to the fact that miRNA have a relatively low number of counts and is either on or off so we had a lot of zeros, these are now filtered out. Then there is a very big variance at low to average number of counts, this is due to the fact that the clusters are very strong in our samples, the miRNA are either on or off depending on the clusters (cluster 1: day 0 and 1, cluster 2: day 3 and 7) so when looking at possibly interesting miRNA they are often on in one of the cluster and off in the other this thus results in a relatively low number of counts but a very big variance (bimodal data). Fow high counts we see that the variance is really low, this is simply showing "not interesting" miRNA that are always very high and thus have a low variance!

One good way to go around that is to look differently at the genes that are always high and those that are only high in one of the clusters (when they are low in both they were already filtered out). Lets look at the plots to see if this hypothesis is true

##### Always high mirRNA

```{r mean_variance_alwaysHigh}
rowsAlwaysHigh <- rowSums(dataNZFilterdBis>=12.5)>=round(0.80*ncol(dataNZFilterdBis))
dataNZFilterAlwaysHigh <- dataNZFilterdBis[rowsAlwaysHigh,]

dge <- DGEList(counts=dataNZFilterAlwaysHigh, group = data_day)
# applies TMM normalization to dge

#dge <- dge[isExpr,]
dge <- calcNormFactors(dge)
data_voomedAH <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)
```

If the hypothesis is true then the histogram should be now normally distributed (no more binomial because the genes we are looking at are always highly expressed)

```{r histogramm_voom_alwaysHigh}
df <- data_voomedAH$E
dataf<- mutate(data.frame(t(df)), group = data_day) %>% 
  gather(mirna, exp, - group) 

ggplot(dataf, aes(x=exp)) + 
  geom_histogram(aes(color = group)) +
  ggtitle("Initial istogram of counts") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

#will not be needed: don't keep big data
remove(dataf)
remove(df)
```

##### miRNA high in a single group

```{r mean_variance_highSingleGroup}
dataNZFilterHighSingleGroup <- dataNZFilterdBis[!rowsAlwaysHigh,]

dge <- DGEList(counts=dataNZFilterHighSingleGroup, group = data_day)
# applies TMM normalization to dge

#dge <- dge[isExpr,]
dge <- calcNormFactors(dge)
data_voomedHSG <- voom(dge,design,plot=TRUE)

#will not be needed: don't keep big data
remove(dge)
```


Now here there should still be a bimodal distribution as when one cluster is high the other one will be low and vis versa.

```{r histogramm_voom_highSingleGroup}
df <- data_voomedHSG$E
dataf<- mutate(data.frame(t(df)), group = data_day) %>% 
  gather(mirna, exp, - group) 

ggplot(dataf, aes(x=exp)) + 
  geom_histogram(aes(color = group)) +
  ggtitle("Initial istogram of counts") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

#will not be needed: don't keep big data
remove(dataf)
remove(df)
```
we see that as hypthosized the histogramm is bimodal

#### Mean variance trend no log

NOTE: this is not important any more !
Note: Seeing the strange plot above makes us think that maybe our count data doesn't need to be log transformed. Let's investigate the relationship between the mean and the variance of ou data.

```{r sqrt4_mean-variance}
#working ... HOW TO USE DATADESEQ ON NORMALIZED => NOT COUNTS ANY MORE?? + IS IT WORTH IT ? cAN we round ?
#dataDESeq <- rlog(as.matrix(data), blind=FALSE )

#takes only genes that are non zero in many samples
#t <- n_data[which(rowSums(n_data) > 300),] 

#tries plotting directly without voom
m <- rowMeans(dataNZ)
RowSigmaSqrt <- function(x) {
  (rowSums((x - rowMeans(x))^2)/(dim(x)[2] - 1))^(1/4)
}
v <- RowSigmaSqrt(dataNZ)

plotxy <- data.frame(mean=m,sigmaSqrt=v)
ggplot(plotxy, aes(x=mean, y=sigmaSqrt)) +
    geom_point(shape=1) +    # Use hollow circles
    geom_smooth(method=lm,   # Add linear regression line
                se=FALSE) + # Don't add shaded confidence region
  ylim(0, 7500) + 
  xlim(0,7500) + 
  ggtitle("Mean variance relationship not Voom") +
  theme(text = element_text(size=14),plot.title = element_text(hjust = 0.5)) 

#will not be needed: don't keep big data
remove(plotxy)
```

From this plot we clearly see that our variance grows with fourth poqer of our mean. The reasoning behind this different relationship is still unclear to us (????????????? biologically), but it could indicate that we shouldn't work with the default limma package and that we should maybe do the analysis manually (????????).  

### Fit linear model

```{r test_linear_model }

fit <- lmFit(data_voomed, design)
fit <- contrasts.fit(fit, contrasts=contrastMatrix)
efit <- eBayes(fit)
topTable(efit, coef=colnames(coef(efit)))
plotSA(efit)

#will not be needed: don't keep big data
remove(fit)
remove(data_voomed)
```

Here there is a problem as we do not see the fitted line (?????).


#### Examining the number of DE genes

Let's now look at differential expression levels of genes. Note that the adjusted p-value cutoff is 5%.

```{r test_linear_model_summary }
dt <- decideTests(efit)
summary(dt)
```

As we have already previously seen in the multiple plots of the previous section: most differential expressed genes are between day 0/1 and 3/7. We also note that a lot of genes in Day 3 and 7 are downregulated (compared to day 0 and 1).

#### Finding genes that are often differentially expressed compared to day 0

Let's look at the genes that are always differentially expressed compared to day 0 

```{r common_DE_genes_day0 }
# Finding genes that are not zero in every comparaison with day 0
commonDEgenesDay0 <- which(dt[,1]!=0 & dt[,2]!=0 & dt[,3]!=0)
names(commonDEgenesDay0)
```

Let's visualise it with a nice venn diagramm

```{r day0_DE_venn }
vennDiagram(dt[,1:3], circle.col=c("turquoise", "salmon", "green"))
```

#### Finding genes that are often differentially between day 0/1 and day 3/7

Let's look at the genes that are always differentially between day 0/1 and 3/7 as they seem to be the ones of interest.

```{r common_DE_genes_day01-37 }
# Finding genes that are not zero in every comparaison with day 0
commonDEgenesDay0137 <- which(dt[,2]!=0 & dt[,3]!=0 & dt[,4]!=0 & dt[,5]!=0)
names(commonDEgenesDay0137)
```

Let's visualise it with a nice venn diagramm

```{r day0137_DE_venn }
vennDiagram(dt[,2:5], circle.col=c("turquoise", "salmon", "green","yellow"))
```

#### Finding genes that are differentially expressed between clusters but not whithin clusters

Finally what we are really interested in are genes that are differentially expressed between clusters but not whithin clusters: here are these genes:

```{r common_DE_genes_betweenCLusters_notWhithin }
# Finding genes that are differentially expressed between clusters but not whithin clusters
commonDEgenesBetweenNotWhithin <- which(dt[,2]!=0 & dt[,3]!=0 & dt[,4]!=0 & dt[,5]!=0 & dt[,1]==0 & dt[,6]==0)
names(commonDEgenesBetweenNotWhithin)
```